Materials and Method with Result and Discussion

The packages; dplyr, plyr , rehsape, reshape2, ggplot were applied for analysis the crop heal data (H. Wickham and Francois 2015; H. Wickham 2011; Wickham and Hadley 2007)

#==============
library(dplyr) # arrange data structure
library(plyr)
library(reshape)
library(reshape2)
library(lubridate)
#================
library(ggplot2)  # plotting
library(gridExtra)
library(scales)
library(cowplot)
#===============
library(bioDist) # Co-ocurrance analysis
library(vegan) # Co-ocurrance analysis
#==============
library(igraph) # Network analysis package

Crop health suyrvey data

Survey data were stored at the google drive under the this link.

library(RCurl) # run this package for load the data form the website 

file <- getURL("https://docs.google.com/spreadsheets/d/1zB7gNdI7Nk7SuHuPWcjzaKnjuwkvL6sOVMo0zMfuV-c/pub?gid=558862364&single=true&output=csv") # load data from the google drive

data <- read.csv(text = file) # read data which is formated as the csv
#======================================================================
data[data == "-"] <- NA # replace '-' with NA

data[data == ""] <- NA # replace 'missing data' with NA

#==== to lower variable names ====
names(data) <- tolower(names(data)) # for more consistancy

#======================================================================
# remove the variables that are not included for analysis
data$phase <- NULL # there is only one type yype of phase in the survey
data$identifier <- NULL # this variable is not included in the analysis
data$village <- NULL # remove name of village
data$year <- NULL # remove year data
data$season <- NULL # remove season data
data$lat <- NULL # remove latitude data
data$long <- NULL # remove longitude data
data$fa <- NULL # field area is not include in the analysis
data$fn <- NULL # farmer name can not be included in this survey analysis
data$fp <- NULL # I do not know what is fp
data$lfm <- NULL # there is only one type of land form in this survey
data$ced <- NULL # Date data can not be included in the network analysis
data$cedjul <- NULL # remove crop establisment julian date data
data$hd <- NULL # Date data can not be included in the network analysis
data$hdjul <- NULL # remove harvest julian date
data$cvr <- NULL # reove crop varieties
data$varcoded <- NULL # I will recode them 
data$fymcoded <- NULL # remove code data of fym
data$mu <- NULL # no record of mullucicide data
data$nplsqm <- NULL # remove number of plant per square meter
data$rbpx <- NULL # no record of rice bug p


#======================================================================
#=================== corract the variable type ========================
#======================================================================

data <- transform(data, 
                  country = as.factor(country),
                  pc = as.factor(pc),
                  cem = as.factor(cem),     
                  ast = as.factor(ast),       
                  ccd = as.numeric(ccd),
                  vartype = as.factor(vartype),
                  fym = as.character(fym),
                  n = as.numeric(n),
                  p = as.numeric(p) ,
                  k = as.numeric(k),
                  mf = as.numeric(mf),        
                  wcp = as.factor(wcp),      
                  iu = as.numeric(iu),     
                  hu = as.numeric(hu),      
                  fu = as.numeric(fu),      
                  cs  = as.factor(cs),      
                  ldg  =  as.numeric(ldg),  
                  yield = as.numeric(yield) ,
                  dscum = as.factor(dscum),   
                  wecum = as.factor(wecum),   
                  ntmax = as.numeric(ntmax), 
                  npmax = as.numeric(npmax),    
                  nltmax = as.numeric(nltmax),  
                  nlhmax = as.numeric(nltmax),  
                  waa = as.numeric(waa),      
                  wba = as.numeric(wba) ,   
                  dhx =  as.numeric(dhx),  
                  whx =  as.numeric(whx),     
                  ssx  = as.numeric(ssx),  
                  wma = as.numeric(wma), 
                  lfa = as.numeric(lfa),
                  lma = as.numeric(lma),   
                  rha  = as.numeric(rha) ,
                  thrx = as.numeric(thrx),    
                  pmx = as.numeric(pmx),    
                  defa  = as.numeric(defa),
                  bphx = as.numeric(bphx),   
                  wbpx = as.numeric(wbpx),    
                  awx  = as.numeric(awx), 
                  rbx =as.numeric(rbx),   
                  rbbx = as.numeric(rbbx),  
                  glhx  = as.numeric(glhx), 
                  stbx=as.numeric(stbx),    
                  hbx= as.numeric(hbx),
                  bbx = as.numeric(bbx),    
                  blba = as.numeric(blba),    
                  lba = as.numeric(lba),    
                  bsa = as.numeric(bsa),    
                  blsa = as.numeric(blsa),  
                  nbsa = as.numeric(nbsa),  
                  rsa  = as.numeric(rsa),   
                  lsa = as.numeric(lsa),    
                  shbx = as.numeric(shbx) ,  
                  shrx = as.numeric(shrx),    
                  srx= as.numeric(srx),    
                  fsmx = as.numeric(fsmx),   
                  nbx =  as.numeric(nbx),   
                  dpx = as.numeric(dpx),    
                  rtdx  = as.numeric(rtdx),  
                  rsdx  = as.numeric(rsdx),
                  gsdx  =as.numeric(gsdx),   
                  rtx = as.numeric(rtx)
) 

#======================================================================
#=================== corract the variable type ========================
#======================================================================

# Now data are in the right format and ready to further analysis, but there are some variables needed to code as the number not character
# Before proforming cluster analysis which is the further analysis, I need to code the character to number

##### recode the previous crop

#if previosu crop data are rice, they will be coded as 1, but others, not rice, they will be coded as 0.

data$pc <- ifelse(data$pc == "rice", 1, 0)
recode the crop establisment mothods

Transplanting rice is 1, and direct seeded rice is 2.

#Crop establisment method
levels(data$cem)[levels(data$cem) == "trp"] <- 1
levels(data$cem)[levels(data$cem) == "TPR"] <- 1
levels(data$cem)[levels(data$cem) == "DSR"] <- 2
levels(data$cem)[levels(data$cem) == "dsr"] <- 2
fym

fym there are two type 0 and 1, raw data are recorded as no, yes, and value, if the value is 0 which mean 0 and if the value more than 0 which means 1

data$fym <- ifelse(data$fym == "no", 0, 
                   ifelse(data$fym == "0", 0, 1
                   )
)
Vartype

Vartype there are three type treditional varieties coded as 1, modern varities coded as 2 and hybrid coded as 3.

data$vartype <- ifelse(data$vartype == "tv", 1,
                       ifelse(data$vartype == "mv", 2,
                              ifelse(data$vartype == "hyb", 3, NA
                              )
                       )
)
Weed control practices

Weed management practices have three type, hand coded as 1, herb coded as 2, and herb-hand coded as 3.

# wcp weed control management
levels(data$wcp)[levels(data$wcp) == "hand"] <- 1

levels(data$wcp)[levels(data$wcp) == "herb"] <- 2

levels(data$wcp)[levels(data$wcp) == "herb-hand"] <- 3
Crop status

Crop status have five level, very poor coded as 1, poor coded as 2, average coded as 3, good coded as 4 and very good coded as 5.

# Crop Status
levels(data$cs)[levels(data$cs) == "very poor"] <- 1

levels(data$cs)[levels(data$cs) == "poor"] <- 2

levels(data$cs)[levels(data$cs) == "average"] <- 3

levels(data$cs)[levels(data$cs) == "good"] <- 4

levels(data$cs)[levels(data$cs) == "very good"] <- 5

Pre-process before performing cluster analysis

all data should be numeric data

#clean the data
num.data <- apply(data[, -c(1,2)], 2, as.numeric) # create dataframe to store the numerical transformation of raw data excluded fno and country

num.data <- as.data.frame(as.matrix(num.data)) # convert from vector to matrix

data <- cbind(data[ , c("fno", "country")], num.data)

data <- data[ , apply(data[, -c(1,2)], 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0

data <- data[complete.cases(data), ] # exclude row which cantain NA
m.data <- melt(data[, !names(data) %in% c("fno", "country")])

varnames <- colnames(data[, !names(data) %in% c("fno", "country")])

 p <- list()
 
for(i in 1:length(varnames)) {
  
        gdata <- m.data %>% filter(variable == varnames[i])
        p[[i]] <- ggplot(gdata, aes(x = value)) + 
        geom_histogram(stat = "bin") + ggtitle(paste("Histogram of", varnames[i], sep = " "))
        
}
 
plot_grid(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], p[[6]], p[[7]], p[[8]], p[[9]], p[[10]],
          p[[11]], p[[12]], p[[13]], p[[14]], p[[15]], p[[16]], p[[17]], p[[18]], p[[19]], p[[20]],
          p[[21]], p[[22]], p[[23]], p[[24]], p[[25]], p[[26]], p[[27]], p[[28]], p[[29]], p[[30]],
          p[[31]], p[[32]], p[[33]], p[[34]], p[[35]], p[[36]], p[[37]], p[[38]], p[[39]], p[[40]],
          p[[41]], p[[42]], p[[43]], p[[44]], p[[45]], p[[46]], p[[47]], p[[48]], p[[49]], p[[50]],
          p[[51]], p[[52]], p[[53]], p[[54]], ncol = 3, align = "v")

#head(data)
# select only the variables related to the injury profiles

start.IP <- "dhx" # set to read the data from column named "dhx"

end.IP <- "rtx" # set to read the data from column named "rtx"

start.col.IP <- match(start.IP, names(data)) # match function for check which column of the data mactch the column named following the conditons above

end.col.IP <- match(end.IP, names(data)) # match function for check which column of the data mactch the column named following the conditons above

IP.data <- data[start.col.IP:end.col.IP] # select the columns of raw data which are following the condition above

#==== pre-process before run correlaiton function
# because the correlaiton measures will not allow to perform with variables whihc have not variance.

IP.data <- IP.data[ ,apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column (variables) with variation = 0

country <- data$country  #combine two cloumn names country and PS

IP.data <- cbind(country, IP.data)

IP.data[is.na(IP.data)] <- 0

name.country <- as.vector(unique(IP.data$country))

construct the global network of injury profiles

global netowork

global.results <- matrix(nrow = 0, ncol = 6)

for(b in 2:(dim(IP.data)[2]-1)){
    #every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
    for(c in (b + 1):(dim(IP.data)[2])){
      
      #summing the abundances of species of the columns that will be compared
      species1.ab <- sum(IP.data[,b])
      
      species2.ab <- sum(IP.data[,c])
      #if the column is all 0's no co-occurrence will be performed
      if(species1.ab > 1 & species2.ab > 1){
        
        test <- cor.test(IP.data[,b], IP.data[,c], method = "spearman", na.action = na.rm, exact = FALSE)
        # There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
        # stackoverflow.com/questions/10711395/spear-man-correlation and ties
        # It would be still valid if the data is not normally distributed.
       
         rho <- test$estimate
        
         p.value <- test$p.value
      }
      
      if(species1.ab <=1 | species2.ab <= 1){
       
         rho <- 0
        
        p.value <- 1
      } 
      
      new.row <- c( names(IP.data)[b], names(IP.data)[c], rho, p.value, species1.ab, species2.ab)
      
      global.results <- rbind(global.results, new.row)          
      
    }
    
}

global.results <- data.frame(data.matrix(global.results))
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276
## --> row.names NOT used
names(global.results) <- c("var1","var2","rho","p.value","ab1","ab2")

#making sure certain variables are factors
global.results$var1 <- as.character(as.factor(global.results$var1))
global.results$var2 <- as.character(as.factor(global.results$var2))
global.results$rho <- as.numeric(as.character(global.results$rho))
global.results$p.value <- as.numeric(as.character(global.results$p.value))
global.results$ab1 <- as.numeric(as.character(global.results$ab1))
global.results$ab2 <- as.numeric(as.character(global.results$ab2))

head(global.results)
##   var1 var2         rho     p.value   ab1   ab2
## 1  dhx  whx  0.13481056 0.006654425 10072 16947
## 2  dhx  ssx  0.13906638 0.005107772 10072  2824
## 3  dhx defa  0.12487379 0.012005084 10072  4565
## 4  dhx bphx -0.09582190 0.054297060 10072 14160
## 5  dhx wbpx -0.08930876 0.072955895 10072  2076
## 6  dhx  awx  0.04379802 0.379931219 10072    87
  sub_global.results <- subset(global.results, as.numeric(as.character(abs(rho))) > 0.2)

  gnet <- graph.edgelist(as.matrix(sub_global.results[, 1:2]), directed = FALSE)
#== adjust layout

  l <- layout.fruchterman.reingold(gnet)
#== adjust vertices
  V(gnet)$color <- "tomato"
  V(gnet)$frame.color <- "gray40"
  V(gnet)$shape <- "circle"
  V(gnet)$size <- 25
  V(gnet)$label.color <- "white"
  V(gnet)$label.font <- 2
  V(gnet)$label.family <- "Helvetica"
  V(gnet)$label.cex <- 0.7

#== adjust the edge
  E(gnet)$weight <- as.matrix(sub_global.results$rho)
  
  E(gnet)$width <- 1 + E(gnet)$weight*5
  
  col <- c("firebrick3", "forestgreen")
  
  colc <- cut(sub_global.results$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
  
  E(gnet)$color <- col[colc]

#== plot network model
  plot(gnet, layout = l * 1.0, main =  "single network model of injuries profiles")

Network model of injury profiles by country

#=====co_occurrence_pairwise.R====
country.results <- matrix(nrow = 0, ncol = 7) # create results to store the outputs

options(warnings = -1) # setting not to show the massages as the warnings

for(a in 1:length(name.country)){
  # subset the raw data by groups of countries and cluster of production situation
  country.temp <- name.country[a]
  #subset the dataset for those treatments
  temp <- subset(IP.data, country == country.temp)
  
  #in this case the community data started at column 6, so the loop for co-occurrence has to start at that point
  for(b in 2:(dim(temp)[2]-1)){
    #every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
    for(c in (b + 1):(dim(temp)[2])){
      
      #summing the abundances of species of the columns that will be compared
      species1.ab <- sum(temp[,b])
      
      species2.ab <- sum(temp[,c])
      #if the column is all 0's no co-occurrence will be performed
      if(species1.ab > 1 & species2.ab > 1){
        
        test <- cor.test(temp[,b], temp[,c], method = "spearman", na.action = na.rm, exact = FALSE)
        # There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
        # stackoverflow.com/questions/10711395/spear-man-correlation and ties
        # It would be still valid if the data is not normally distributed.
       
         rho <- test$estimate
        
         p.value <- test$p.value
      }
      
      if(species1.ab <=1 | species2.ab <= 1){
       
         rho <- 0
        
        p.value <- 1
      } 
      
      new.row <- c(name.country[a], names(temp)[b], names(temp)[c], rho, p.value, species1.ab, species2.ab)
      
      country.results <- rbind(country.results, new.row)            
      
    }
    
  }
 
   print(a/length(name.country))
  
}
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
country.results <- data.frame(data.matrix(country.results))

names(country.results) <- c("country","var1","var2","rho","p.value","ab1","ab2")

#making sure certain variables are factors
country.results$country <- as.factor(country.results$country)
country.results$taxa1 <- as.character(as.factor(country.results$var1))
country.results$taxa2 <- as.character(as.factor(country.results$var2))
country.results$rho <- as.numeric(as.character(country.results$rho))
country.results$p.value <- as.numeric(as.character(country.results$p.value))
country.results$ab1 <- as.numeric(as.character(country.results$ab1))
country.results$ab2 <- as.numeric(as.character(country.results$ab2))

head(country.results)
##   country var1 var2         rho      p.value  ab1  ab2 taxa1 taxa2
## 1     PHL  dhx  whx  0.06524012 0.6891879742 1307 2089   dhx   whx
## 2     PHL  dhx  ssx -0.12272044 0.4506097948 1307  109   dhx   ssx
## 3     PHL  dhx defa  0.18989475 0.2405423584 1307 1941   dhx  defa
## 4     PHL  dhx bphx -0.11699678 0.4721633481 1307 1286   dhx  bphx
## 5     PHL  dhx wbpx -0.50743562 0.0008316712 1307   84   dhx  wbpx
## 6     PHL  dhx  awx  0.00000000 1.0000000000 1307    1   dhx   awx
  sub_country.results <- subset(country.results, as.numeric(as.character(abs(rho))) > 0.2)

  results_sub.by.country <- list()
  
  for(i in 1: length(name.country)){
  
  results_sub.by.country[[i]] <- subset(sub_country.results, country == name.country[i])
  }
### Network analysis

# head(results_sub.by.group[[1]][,2:3]) # get the list
#layout(matrix(c(1:14), 9, 2, byrow = TRUE))
  cnet  <- list()

for(i in 1:length(name.country)){
  
  cnet[[i]] <- graph.edgelist(as.matrix(results_sub.by.country[[i]][, 2:3]), directed = FALSE)
#== adjust layout

  l <- layout.circle(cnet[[i]])
#== adjust vertices
  V(cnet[[i]])$color <- "tomato"
  V(cnet[[i]])$frame.color <- "gray40"
  V(cnet[[i]])$shape <- "circle"
  V(cnet[[i]])$size <- 25
  V(cnet[[i]])$label.color <- "white"
  V(cnet[[i]])$label.font <- 2
  V(cnet[[i]])$label.family <- "Helvetica"
  V(cnet[[i]])$label.cex <- 0.7

#== adjust the edge
  E(cnet[[i]])$weight <- as.matrix(results_sub.by.country[[i]][, 4])
  
  E(cnet[[i]])$width <- 1 + E(cnet[[i]])$weight*5
  
  col <- c("firebrick3", "forestgreen")
  
  colc <- cut(results_sub.by.country[[i]]$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
  
  E(cnet[[i]])$color <- col[colc]

#== plot network model
  plot(cnet[[i]], layout = l * 1.0, main = paste( "network model of", name.country[i]))
}

  network.value <- list()
for(i in 1:length(cnet)){
  
  E(cnet[[i]])$weight <- abs(as.matrix(results_sub.by.country[[i]][, 4]))
  
  network.value[[i]] <- data.frame(
  country = name.country[i],
  node = V(cnet[[i]])$name,
  degree = degree(cnet[[i]]),
  betweenness = betweenness(cnet[[i]]),
  closeness = closeness(cnet[[i]]),
  eigenvector = evcent(cnet[[i]])$vector,
  clusterCoef = transitivity(cnet[[i]],type=c("local"))
  )
network.value[[i]]$res = as.vector(lm(eigenvector ~ betweenness, data = network.value[[i]])$residuals)
  }
  
  network.value <- as.data.frame(do.call("rbind", network.value))
  row.names(network.value ) <- NULL

Construction of network of injury profiles seperated by country

# compute the topological properties
m.net.value <- melt(network.value[1:7])
## Using country, node as id variables
m.net.value  %>% ggplot(aes(x= value, y = reorder(node, value))) +
         geom_point(size = 3, aes(color = country)) +
        theme_bw() +
        theme(panel.grid.major.x =  element_blank(),
              panel.grid.minor.x =  element_blank(),
              panel.grid.major.y = element_line(color = "grey", linetype = 3)) +
        ylab("Variables")  +
  facet_grid(.~variable, scale = "free")
## Warning: Removed 8 rows containing missing values (geom_point).

Key Actor Analysis

(Willocquet et al. 2004)

layout(matrix(c(1:9), 5, 2, byrow = TRUE))
## Warning in matrix(c(1:9), 5, 2, byrow = TRUE): data length [9] is not a
## sub-multiple or multiple of the number of rows [5]
for(i in 1:length(name.country)){
  
 plot <- network.value %>% 
   filter(country == name.country[i]) %>%
   ggplot(aes(x = betweenness, y = eigenvector,
               label = node,
               color = res,
               size = abs(res))
          ) +
    xlab("Betweenness Centrality") +
    ylab("Eigencvector Centrality") +
    geom_text() +
    ggtitle(paste("Key Actor Analysis for Injuiry Profiles of", name.country[i]))

print(plot) 
}

Reference

Wickham, Hadley. 2011. “The Split-Apply-Combine Strategy for Data Analysis.” Journal of Statistical Software 40 (1): 1–29. http://www.jstatsoft.org/v40/i01/.

Wickham, Hadley, and Romain Francois. 2015. Dplyr: A Grammar of Data Manipulation. http://CRAN.R-project.org/package=dplyr.

Wickham, and Hadley. 2007. “Reshaping Data with the Reshape Package.” Journal of Statistical Software 21 (12). http://www.jstatsoft.org/v21/i12/paper.

Willocquet, Laetitia, Francisco A Elazegui, Nancy Castilla, Luzviminda Fernandez, Kenneth S Fischer, ShaoBing Peng, Paul S Teng, et al. 2004. “Research Priorities for Rice Pest Management in Tropical Asia: A Simulation Analysis of Yield Losses and Management Efficiencies.” Phytopathology 94 (7). Am Phytopath Society: 672–82.